library(limma)
library(edgeR)
library(tximport)
library(AnnotationHub)
library(magrittr)
library(scales)
library(tidyverse)
library(ggrepel)
library(RColorBrewer)
library(fgsea)
library(msigdbr)
library(pander)
library(kableExtra)
library(ggfortify)
library(ggpubr)
library(pheatmap)
library(here)
library(goseq)
library(RUVSeq)
library(harmonicmeanp)
library(ngsReports)
library(UpSetR)
library(gridExtra)
if (interactive()) setwd(here("R"))
panderOptions("table.split.table", Inf)
theme_set(theme_bw())
The alignments in this analysis were generated by Steve. He aligned each library (including technical replicates) to the Zebrafish transcriptome from Ensembl Release 94 (GRCz11) using kallisto (v0.43.1). In addition to the standard transcriptome, the two mutant psen2 transcripts were manually added to the reference.
The corresponding set of gene descriptions were then loaded into R as an EnsDb object using the AnnotationHub() infrastructure. Likewise, the set of transcript descriptions were loaded, with the manual addition of the two novel psen2 mutants.
ah <- AnnotationHub() %>%
subset(species == "Danio rerio") %>%
subset(rdataclass == "EnsDb")
ensDb <- ah[["AH64906"]]
genes <- genes(ensDb)
mcols(genes) <- mcols(genes)[!names(mcols(genes)) %in% c("seq_coord_system", "gene_name")]
transcripts <- transcripts(ensDb)
psen2 <- subset(transcripts, gene_id == "ENSDARG00000015540") %>%
granges() %>%
magrittr::extract(c(1, 1),)
names(psen2) <- c("psen2T141_L142delinsMISLISV", "psen2N140fs")
mcols(psen2) <- DataFrame(tx_id = names(psen2),
tx_biotype = c("protein_coding", "nonsense_mediated_decay"),
gene_id = "ENSDARG00000015540",
tx_id_version = names(psen2),
tx_name = names(psen2))
transcripts %<>%
GRangesList(psen2) %>%
unlist() %>%
sort()
transcripts$symbol <- mcols(genes[transcripts$gene_id])$symbol
# tx2gene <- mcols(transcripts)[c("tx_id_version", "gene_id")]
tx2gene <- mcols(transcripts)[c("tx_id_version", "symbol")] %>%
subset(symbol != "")
geneCounts <- list.files(path = here("3_kallisto"), recursive = TRUE, pattern = "h5", full.names = TRUE) %>%
set_names(basename(dirname(.))) %>%
set_names(str_replace(names(.), "_-", "WT")) %>%
tximport(type = "kallisto", tx2gene = tx2gene)
Transcript-level counts were imported using catchKallisto() from edgeR in order to check that expression of the psen2 alleles matches the genotype of the fish which was determined by PCRs performed on genomic DNA from each fish extracted from their tail.
From the plot below, we see that fish 8_FS_4 is actually a transheterozygous fish and should be omitted.
transCounts <- list.files(here("3_kallisto/"), full.names = TRUE) %>%
catchKallisto()
colnames(transCounts$counts) %<>%
basename() %>%
str_replace("_-", "WT")
psen2IDs <- subset(transcripts, symbol == "psen2") %>% names()
transCounts$counts %>%
cpm() %>%
set_rownames(str_remove(rownames(.), "\\.[0-9]+")) %>%
magrittr::extract(psen2IDs,) %>%
as.data.frame() %>%
rownames_to_column("Transcript") %>%
gather(key = "Sample", value = "CPM", -Transcript) %>%
mutate(Transcript = str_replace(Transcript, "ENSDART.+", "psen2-WT"),
Genotype = str_extract(Sample, "(WT|FAD|FS)")) %>%
ggplot(aes(Sample, CPM, fill = Transcript)) +
geom_bar(stat = "identity", colour = "black") +
facet_wrap(~Genotype, nrow = 1, scales = "free_x") +
scale_fill_viridis_d(option = "viridis") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
aspect.ratio = 1)
CPM values for each psen2 transcript across all samples.
minCpm <- 1
minSamples <- 5
dge <- geneCounts$counts %>%
magrittr::extract(rowSums(cpm(.) > minCpm) >= minSamples,) %>%
DGEList() %>%
calcNormFactors()
dge$samples %<>%
mutate(Sample = rownames(.),
Genotype = str_extract(Sample, "(WT|FAD|FS)"),
Genotype = factor(Genotype, levels = c("WT", "FS", "FAD")),
group = as.integer(Genotype),
Genotype_forPub = case_when(
Genotype == "FAD" ~ "EOfAD",
Genotype == "FS" ~ "FS" ,
TRUE ~ "WT"
),
Genotype_forPub = factor(Genotype_forPub, levels = c("WT", "FS", "EOfAD", "transhet"))) %>%
set_rownames(.$Sample)
dge$samples["8_FS_4",]$Genotype_forPub <- "transhet"
dge$samples %<>%
mutate(Sex = case_when(
Sample %in% c("15_WT_1",
"14_WT_2",
"13_WT_3",
"10_FS_1",
"5_FS_2",
"9_FS_3",
"6_FAD_1",
"4_FAD_2",
"3_FAD_3") ~ "Female" ,
TRUE ~ "Male")) %>%
set_rownames(.$Sample)
Gene-level counts were imported using tximport, mapping transcripts to genes. Some genes exist in the primary assembly and on alternate assemblies for specific regions, and these were considered as separate transcripts of the same gene for read summarisation purposes. Transcript counts were thus mapped to genes using the gene symbol (e.g. psen2), instead of the gene id.
Upon loading into a DGE object, genes which had less than 1 in at least 5 of the samples were omitted. This equated to about 31 reads for a gene in at least 5 samples for inclusion in downstream analysis, giving a total of 18,808 of the original genes for DGE analysis.
Total counts from each library after assigning to genes
I next will perform a Principal Component Analysis to visualise the similarity between samples. Note that sample 12_WT_4 appears quite distant from the others. No clear seperation is observed of genotypes or sex. Also PC1 and PC2 only account for ~30% of the total vriation in this dataset, which is lower than one would expect
dge %>%
cpm(log = TRUE) %>%
t() %>%
prcomp() %>%
autoplot(
data = tibble(Sample = rownames(.$x)) %>%
left_join(dge$samples),
colour = "Genotype_forPub",
size = 4,
shape = "Sex",
) +
geom_label_repel(
aes(label = Sample, colour = Genotype_forPub), show.legend = FALSE
) +
scale_color_manual(values = c("darkcyan", "firebrick" , "purple4", "grey30"))
Another way of visualising the similarity between samples is a multi-dimensional scaling plot (MDS) plot. The distances between samples can be interpreted as the leading log2-fold-change, so that similar samples will cluster together. Like in the PCA plot, sample 12_WT_4 appears quite distant from the rest, and no seperation between genotypes or sex is observed.
# save this plot as an object for later.
mds1 <- dge %>%
plotMDS(plot = FALSE) %>%
extract2("cmdscale.out") %>%
set_colnames(paste0("Dim", 1:2)) %>%
as.data.frame() %>%
rownames_to_column("Sample") %>%
left_join(dge$samples) %>%
ggplot(aes(Dim1, Dim2)) +
geom_point(aes(colour = Genotype_forPub, shape = Sex), size = 4) +
labs(x = "Dimension 1",
y = "Dimension 2",
colour = "Genotype") +
geom_label_repel(aes(label = Sample, colour = Genotype_forPub)) +
scale_colour_manual(values = c("grey30", "firebrick", "darkcyan", "purple")) +
theme(aspect.ratio = 1)
mds1
I showed before that 8_FS_4 is actually a transhet sample. Therefore, I will omit it from the analysis.
In the PCA and MDS plots above, and in the previous limma analysis by Steve, sample 12_WT_4 was an obvious outlier as it did not cluster with the rest of the samples, and steve showed it was downweighted majorly compared to the others and will also e omitted.
dropSamples <- c("8_FS_4", "12_WT_4")
dge <- dge[,!colnames(dge) %in% dropSamples]
the MDS and PCA plots are now regenerated below
dge %>%
plotMDS(plot = FALSE) %>%
extract2("cmdscale.out") %>%
set_colnames(paste0("Dim", 1:2)) %>%
as.data.frame() %>%
rownames_to_column("Sample") %>%
left_join(dge$samples) %>%
ggplot(aes(Dim1, Dim2)) +
geom_point(aes(colour = Genotype_forPub, shape = Sex), size = 4) +
#geom_label_repel(aes(label = Sample, colour = Genotype), show.legend = FALSE) +
theme_bw() +
labs(x = "Dimension 1",
y = "Dimension 2",
colour = "Genotype") +
scale_colour_manual(values = c("grey30", "firebrick", "darkcyan" ))
dge %>%
cpm(log = TRUE) %>%
t() %>%
prcomp() %>%
autoplot(
data = tibble(Sample = rownames(.$x)) %>%
left_join(dge$samples),
colour = "Genotype",
size = 4,
shape = "Sex",
) +
geom_label_repel(
aes(label = Sample, colour = Genotype),
show.legend = FALSE
) +
scale_colour_manual(values = c("grey30", "firebrick", "darkcyan" ))
dge %>%
cpm(log = TRUE) %>%
t() %>%
prcomp() %>%
summary() %>%
extract2("importance") %>%
magrittr::extract(,paste0("PC", 1:5)) %>%
kable(caption = paste("*First five principal components, showing that the first two only account for", percent(.[3,2]), "of the total variance, which is below expectations*")) %>%
kable_styling()
| PC1 | PC2 | PC3 | PC4 | PC5 | |
|---|---|---|---|---|---|
| Standard deviation | 22.38785 | 18.64065 | 17.46163 | 16.30118 | 14.52113 |
| Proportion of Variance | 0.19215 | 0.13321 | 0.11689 | 0.10187 | 0.08084 |
| Cumulative Proportion | 0.19215 | 0.32536 | 0.44225 | 0.54412 | 0.62495 |
Another visualisation here is performed where samples are clustered based on their Euclidean distance.
anno <-
dge %>%
cpm(log = T) %>%
t() %>%
rownames() %>%
as.data.frame() %>%
set_colnames("Sample") %>%
left_join(dge$samples) %>%
dplyr::select(Sample, c(Genotype = Genotype_forPub), Sex) %>%
column_to_rownames("Sample")
dge %>%
cpm(log = T) %>%
t() %>%
dist(method = "euclidean") %>%
as.matrix() %>%
pheatmap(
color = viridis::viridis_pal(option = "viridis")(100),
annotation_col = anno,
annotation_colors = list(Genotype = c(FS = "firebrick",
WT = "grey30",
EOfAD = "darkcyan"),
Sex = c(Male = "#FF9700",
Female = "#0370FF")),
show_rownames = F,
show_colnames = F,
annotation_row = anno,
cellwidth = 10, cellheight = 10,
treeheight_row = 10, treeheight_col = 10)
Steve previously performed a limma analysis. Here, I will perform a DGE analysis using the GLM and likelihood ratio tests of edgeR, which in our experience, has more power to detect DE genes.
# model matrix
design <- model.matrix(~Genotype, data = dge$samples) %>%
set_colnames(gsub("Genotype", "", colnames(.)))
# fit the glm
fit <- dge %>%
estimateDisp(design) %>%
glmFit(design)
topTables_glm <- design %>% colnames() %>% .[2:3] %>%
sapply(function(x){
glmLRT(fit, coef = x) %>%
topTags(n = Inf) %>%
.[["table"]] %>%
as.data.frame() %>%
rownames_to_column("symbol") %>%
arrange(PValue) %>%
mutate(
coef = x,
DE = FDR < 0.05
) %>%
dplyr::select(
symbol, logFC, logCPM, PValue, FDR, DE, everything()
) %>%
as_tibble()
}, simplify = FALSE)
topTables_glm %>%
bind_rows() %>%
ggplot(aes(logFC, -log10(PValue))) +
geom_point(aes(colour = DE)) +
geom_label_repel(aes(label = symbol),
data = . %>%
dplyr::filter(DE == T) %>%
dplyr::slice(1:20)) +
facet_wrap(~coef, scales = "free_x") +
scale_colour_manual(values = c("grey50", "red")) +
theme_bw() +
theme(legend.position = "bottom")
topTables_glm %>%
bind_rows() %>%
ggplot(aes(logCPM, logFC)) +
geom_point(aes(colour = DE)) +
geom_label_repel(aes(label = symbol),
data = . %>%
dplyr::filter(DE == T),
size = 3, alpha = 0.5) +
facet_wrap(~coef, ncol = 1) +
scale_colour_manual(values = c("grey50", "red")) +
theme_bw()
MD plot for psen2N140fs/+ samples and psen2T141_L142delinsMISLISV/+ compared to psen2+/+ samples
I want to check for a GC or length bias for differential expression. If such a bias is observed, I can correct for it using cqn. GC and length information is easily available from EnsDb objects from release 98 onwards, so this release was used. No expressed genes were noted as absent from the annotations for Release 98, despite the differences between releases. GC content and Length were taken as weighted averages and simple averages respectively. No discernible bias is observed, supporting that gene set testing can be performed in this dataset.
ens98 <- ah[["AH74989"]]
genes98 <- genes(ens98)
ex <- exonsBy(ens98, "tx")
tr <- transcripts(ens98)
tr$len <- ex %>%
width %>%
lapply(sum) %>%
.[names(tr)] %>%
unlist()
gnBias <- mcols(tr) %>%
as.data.frame() %>%
group_by(gene_id) %>%
summarise(
n = n(),
meanGC = sum(len*gc_content) / sum(len),
len = mean(len)
)
mcols(genes) %<>%
as.data.frame() %>%
left_join(gnBias) %>%
DataFrame()
gnGC <- genes %>%
mcols() %>%
as_tibble() %>%
unchop(entrezid)
topTables_glm %>%
lapply(function(x) {
x %>%
left_join(gnGC) %>%
as_tibble()
}) %>%
bind_rows() %>%
ggplot(aes(meanGC, sign(logFC)*-log10(PValue))) +
geom_point(aes(colour = DE)) +
geom_smooth(se = F) +
facet_wrap(~coef, ncol = 1) +
scale_colour_manual(values = c("grey50", "red")) +
theme_bw()
topTables_glm %>%
lapply(function(x) {
x %>%
left_join(gnGC) %>%
as_tibble()
}) %>%
bind_rows() %>%
ggplot(aes(len, sign(logFC)*-log10(PValue))) +
geom_point(aes(colour = DE)) +
geom_smooth(se = F) +
geom_hline(yintercept = 0, linetype = 2) +
scale_x_log10() +
facet_wrap(~coef, ncol = 1) +
scale_colour_manual(values = c("grey50", "red")) +
theme_bw()
Here, I will perform enichment analysis both within the DE genes using goseq, which allows for a covariate to be a predictor variable, and then within ranked lists using fry (which handles inter-gene correlations better than other methods such as GSEA). The gene sets which will be used are the KEGG and GO gene sets from MSigDB, and the IRE gene sets which were previously defined by Nhi from our lab which contains genes wich contain an iron-responsive element in the untranslated regions.
the KEGG and GO gene sets were imported using the msigdbr package. The GO terms were only included if they had 3 or more steps back to the ontology root terms. The ire gene sets were provided by Nhi as an .rds object which was directly imported.
ens2Entrez <- genes %>%
as_tibble() %>%
dplyr::select(symbol, entrezid) %>%
unchop(entrezid) %>%
dplyr::filter(symbol %in% rownames(dge)) %>%
dplyr::filter(!is.na(entrezid))
KEGG <-
msigdbr("Danio rerio", category = "C2", subcategory = "CP:KEGG") %>%
dplyr::rename(entrezid = entrez_gene) %>%
inner_join(ens2Entrez) %>%
distinct(gs_name, symbol, .keep_all = TRUE) %>%
split(f = .$gs_name) %>%
lapply(extract2, "symbol")
# obtained the IRE gene sets from Nhi
ireGenes <- readRDS(here("R/zebrafishIreGenes.rds")) %>%
lapply(function(x){
x %>%
as.data.frame() %>%
set_colnames("gene_id") %>%
as_tibble() %>%
left_join(genes %>%
as_tibble() %>%
dplyr::select(gene_id, symbol)) %>%
.$symbol
})
# GO terms
goSummaries <- url("https://uofabioinformaticshub.github.io/summaries2GO/data/goSummaries.RDS") %>%
readRDS() %>%
mutate(
Term = Term(id),
gs_name = Term %>% str_to_upper() %>% str_replace_all("[ -]", "_"),
gs_name = paste0("GO_", gs_name)
)
minPath <- 3
go <- msigdbr("Danio rerio", category = "C5") %>%
dplyr::rename(entrezid = entrez_gene) %>%
inner_join(ens2Entrez) %>%
distinct(gs_name, symbol, .keep_all = TRUE) %>%
split(f = .$gs_name) %>%
lapply(extract2, "symbol")
To test within the DE genes for over-representation of these gene sets, I will use goseq. goseq quantifies the probability of a gene being considered as DE based on a single covariate and corrects for it by estimatting a probability weight function (PWF). Here, I will use the average transcript length per gene. Very minimal over-representation is obsevred in both lists of DE genes for each mutation.
pwfs <- topTables_glm %>%
lapply(function(x) {
x %>%
left_join(gnGC, by = "symbol") %>%
as_tibble() %>%
distinct(symbol, .keep_all = TRUE)
}) %>%
lapply(function(x) {
x %>%
with(
nullp(
DEgenes = structure(DE, names = symbol),
bias.data = len
)
)
})
goseq(pwfs$FS, gene2cat = c(KEGG, go, ireGenes)) %>%
as_tibble() %>%
mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr")) %>%
head(5) %>%
dplyr::select(-under_represented_pvalue) %>%
kable(caption = "KEGG, GO and IRE gene sets approaching being over-represented in the DE list for the FS mutation") %>%
kable_styling()
| category | over_represented_pvalue | numDEInCat | numInCat | FDR |
|---|---|---|---|---|
| GO_MUSCLE_FILAMENT_SLIDING | 1.00e-07 | 4 | 27 | 0.0013886 |
| GO_ACTIN_MEDIATED_CELL_CONTRACTION | 2.22e-05 | 4 | 94 | 0.0793438 |
| GO_SKELETAL_MUSCLE_CONTRACTION | 2.30e-05 | 3 | 31 | 0.0793438 |
| GO_PHOSPHOTRANSFERASE_ACTIVITY_NITROGENOUS_GROUP_AS_ACCEPTOR | 3.42e-05 | 2 | 5 | 0.0884924 |
| GO_ACTIN_FILAMENT_BASED_MOVEMENT | 4.71e-05 | 4 | 114 | 0.0973880 |
goseq(pwfs$FAD, gene2cat = c(KEGG,go, ireGenes)) %>%
as_tibble() %>%
mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr")) %>%
head(5) %>%
dplyr::select(-under_represented_pvalue) %>%
kable(caption = "KEGG, GO and IRE gene sets approaching being over-represented in the DE list for the FAD mutation") %>%
kable_styling()
| category | over_represented_pvalue | numDEInCat | numInCat | FDR |
|---|---|---|---|---|
| GO_RNA_BINDING | 0.1133806 | 1 | 1420 | 1 |
| GO_DNA_CATABOLIC_PROCESS_ENDONUCLEOLYTIC | 1.0000000 | 0 | 26 | 1 |
| GO_GLUTAMATERGIC_SYNAPSE | 1.0000000 | 0 | 353 | 1 |
| GO_ION_TRANSPORT | 1.0000000 | 0 | 1297 | 1 |
| GO_POSITIVE_REGULATION_OF_CELL_PROJECTION_ORGANIZATION | 1.0000000 | 0 | 371 | 1 |
To obtain a more complete view on the changes to gene expression due to each mutation, I will use fry from the limma package, which can take into account inter-gene correlations. Due to the highly overlapping nature of GO terms, I will only look at the ire and KEGG gene sets.
Using fry, only genes inolved in the Notch signalling pathway is found to be alered in by the FS mutation.
fryRes <- design %>% colnames() %>% .[2:3] %>%
sapply(function(x) {
cpm(dge$counts, log = TRUE) %>%
fry(
index = c(KEGG, ireGenes),
design = design,
contrast = x,
sort = "mixed"
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = x,
sig = FDR.Mixed < 0.05)
}, simplify = FALSE)
fryRes$FS %>%
head(5) %>%
kable(caption = "Top 5 most sig altered gene sets found using fry due to heterozygosity for the psen2 N140fs mutation" ) %>%
kable_styling()
| pathway | NGenes | Direction | PValue | FDR | PValue.Mixed | FDR.Mixed | coef | sig |
|---|---|---|---|---|---|---|---|---|
| KEGG_NOTCH_SIGNALING_PATHWAY | 43 | Down | 0.2102627 | 0.9832361 | 0.0000013 | 0.0002482 | FS | TRUE |
| KEGG_NON_HOMOLOGOUS_END_JOINING | 11 | Up | 0.2363334 | 0.9832361 | 0.0521496 | 0.9750871 | FS | FALSE |
| KEGG_TASTE_TRANSDUCTION | 21 | Down | 0.1660423 | 0.9832361 | 0.0608465 | 0.9750871 | FS | FALSE |
| KEGG_FATTY_ACID_METABOLISM | 37 | Up | 0.0128414 | 0.9082459 | 0.0867987 | 0.9750871 | FS | FALSE |
| KEGG_ARACHIDONIC_ACID_METABOLISM | 25 | Up | 0.1325214 | 0.9832361 | 0.0940346 | 0.9750871 | FS | FALSE |
fryRes$FAD %>%
head(5) %>%
kable(caption = "Top 5 most sig altered gene sets found using fry due to heterozygosity for the psen2 FAD-like mutation" ) %>%
kable_styling()
| pathway | NGenes | Direction | PValue | FDR | PValue.Mixed | FDR.Mixed | coef | sig |
|---|---|---|---|---|---|---|---|---|
| KEGG_CELL_CYCLE | 116 | Down | 0.0808469 | 0.9943924 | 0.0054747 | 0.6034515 | FAD | FALSE |
| KEGG_OOCYTE_MEIOSIS | 101 | Up | 0.8279451 | 0.9943924 | 0.0083455 | 0.6034515 | FAD | FALSE |
| KEGG_VIBRIO_CHOLERAE_INFECTION | 49 | Up | 0.4082509 | 0.9943924 | 0.0135124 | 0.6034515 | FAD | FALSE |
| KEGG_BASAL_CELL_CARCINOMA | 47 | Down | 0.0175232 | 0.9943924 | 0.0152448 | 0.6034515 | FAD | FALSE |
| KEGG_ENDOMETRIAL_CANCER | 50 | Down | 0.1119994 | 0.9943924 | 0.0275173 | 0.6034515 | FAD | FALSE |
Since very little conclusions were found about the FAD like mutation, I will use RUVseq to see whether I can detect more differential expression of genes/gene sets. I will use the RUVg method, which uses negative control genes. In this case, it will be the least 10,000 DE genes from the initial differential expression due to psen2 genotype.
RUVneg <-
dge %>%
estimateDisp(design) %>%
glmFit(design) %>%
glmLRT(coef = 2:3) %>% # use both psen2 genotype coefs
topTags(n = Inf, adjust.method = "fdr", sort.by = "p") %>%
.$table %>%
as.data.frame() %>%
rownames_to_column("symbol") %>%
arrange(desc(PValue)) %>%
.[1:10000, ] %>%
.$symbol
### Perform the RUVg method with 1 factor of unwanted variation removed,
RUV_k1 <- dge$counts %>%
round %>%
RUVg(RUVneg, 1)
A PCA analysis was performed on the RUVSeq normalised counts. Not enough of the total variance was explained by PC1 + PC2. However, some seperation between the FS samples was observed.
pca_RUV_k1 <- RUV_k1$normalizedCounts %>%
as.data.frame() %>%
cpm(log = T) %>%
t() %>%
prcomp()
pca_RUV_k1 %>%
autoplot(
data = tibble(Sample = rownames(.$x)) %>%
left_join(dge$samples),
colour = "Genotype_forPub",
size = 4,
shape = "Sex"
) +
geom_label_repel(
aes(label = Sample, colour = Genotype_forPub),
show.legend = FALSE
) +
scale_colour_manual(values = c("grey30", "firebrick", "darkcyan" )) +
theme(aspect.ratio = 1) +
labs(colour = "Genotype")
#ggsave("plots/PCA_RUV.png", width = 10, height = 10, units = "cm", dpi = 600)
The MDS plot shows that the FS samples also seperate from the others.
RUV_k1$normalizedCounts %>%
cpm(log=T) %>%
plotMDS(plot = FALSE) %>%
extract2("cmdscale.out") %>%
set_colnames(paste0("Dimension ", 1:2)) %>%
as.data.frame() %>%
rownames_to_column("Sample") %>%
left_join(dge$samples) %>%
ggplot(aes(`Dimension 1`, `Dimension 2`)) +
geom_point(aes(colour = Genotype_forPub, shape = Sex), size = 4) +
scale_colour_manual(values = c("grey50", "firebrick", "darkcyan" )) +
geom_label_repel(aes(label = Sample, colour = Genotype_forPub), show.legend = FALSE) +
theme(aspect.ratio = 1) +
labs(colour = "Genotype")
RUV_k1$normalizedCounts %>%
cpm(log = T) %>%
t() %>%
dist(method = "euclidean") %>%
as.matrix() %>%
pheatmap(
color = viridis::viridis_pal(option = "viridis")(100),
annotation_col = anno,
annotation_colors = list(Genotype = c(FS = "firebrick",
WT = "grey30",
EOfAD = "darkcyan")),
show_rownames = F, show_colnames = F,
annotation_row = anno,
cellwidth = 10, cellheight = 10,
treeheight_row = 5, treeheight_col = 5)
I next will perform an additional DGE analysis including the W_1 covariate generated by RUVSeq in the design matrix.
dge$samples %<>%
cbind(RUV_k1$W)
# model matrix
design_RUV <- model.matrix(~Genotype + W_1, data = dge$samples)%>%
set_colnames(gsub("Genotype", "", colnames(.)))
# fit the glm
fit_RUV <- dge %>%
estimateDisp(design_RUV) %>%
glmFit(design_RUV)
topTables_glm_RUV <- design_RUV %>% colnames() %>% .[2:3] %>%
sapply(function(x){
glmLRT(fit_RUV, coef = x) %>%
topTags(n = Inf) %>%
.[["table"]] %>%
as.data.frame() %>%
rownames_to_column("symbol") %>%
arrange(PValue) %>%
mutate(
coef = x,
DE = FDR < 0.05
) %>%
dplyr::select(
symbol, logFC, logCPM, PValue, FDR, DE, everything()
) %>%
as_tibble()
}, simplify = FALSE)
topTables_glm_RUV %>%
bind_rows() %>%
ggplot(aes(logFC, -log10(PValue))) +
geom_point(aes(colour = DE)) +
facet_wrap(~coef, scales = "free_x") +
# geom_label_repel(aes(label = symbol),
# data = . %>%
# dplyr::filter(DE == T) %>%
# dplyr::slice(1:20)) +
scale_colour_manual(values = c("grey50", "red")) +
theme_bw() +
theme(legend.position = "bottom")
topTables_glm %>%
bind_rows() %>%
ggplot(aes(logCPM, logFC)) +
geom_point(aes(colour = DE)) +
geom_label_repel(aes(label = symbol),
data = . %>%
dplyr::filter(DE == T),
size = 3, alpha = 0.5) +
facet_wrap(~coef, ncol = 1) +
scale_colour_manual(values = c("grey50", "red")) +
theme_bw()
MD plot for psen2N140fs/+ samples and psen2T141_L142delinsMISLISV/+ compared to psen2+/+ samples
The goseq analysis was repeated. However, no over-representation was observed. The top 5 gene sets in each genotype are shown in the tables below.
pwfs_RUV <- topTables_glm_RUV %>%
lapply(function(x) {
x %>%
left_join(gnGC, by = "symbol") %>%
as_tibble() %>%
distinct(symbol, .keep_all = TRUE)
}) %>%
lapply(function(x) {
x %>%
with(
nullp(
DEgenes = structure(DE, names = symbol),
bias.data = len
)
)
})
goseq(pwfs_RUV$FS, gene2cat = c(KEGG, ireGenes, go)) %>%
as_tibble() %>%
mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr")) %>%
head(5) %>%
dplyr::select(-under_represented_pvalue) %>%
kable(caption = "KEGG, GO and IRE gene sets approaching being over-represented in the DE list for the FS mutation") %>%
kable_styling()
| category | over_represented_pvalue | numDEInCat | numInCat | FDR |
|---|---|---|---|---|
| GO_PHOSPHOTRANSFERASE_ACTIVITY_NITROGENOUS_GROUP_AS_ACCEPTOR | 0.0000196 | 2 | 5 | 0.2032223 |
| GO_TRNA_3_END_PROCESSING | 0.0000703 | 2 | 9 | 0.3635225 |
| GO_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS | 0.0009853 | 3 | 140 | 1.0000000 |
| KEGG_ALZHEIMERS_DISEASE | 0.0011036 | 3 | 145 | 1.0000000 |
| GO_NCRNA_3_END_PROCESSING | 0.0012017 | 2 | 36 | 1.0000000 |
goseq(pwfs_RUV$FAD, gene2cat = c(KEGG, ireGenes, go)) %>%
as_tibble() %>%
mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr")) %>%
arrange(numDEInCat) %>%
head(5) %>%
dplyr::select(-under_represented_pvalue) %>%
kable(caption = "KEGG, GO and IRE gene sets approaching being over-represented in the DE list for the FS mutation") %>%
kable_styling()
| category | over_represented_pvalue | numDEInCat | numInCat | FDR |
|---|---|---|---|---|
| GO_SMALL_RIBOSOMAL_SUBUNIT_RRNA_BINDING | 1 | 0 | 10 | 1 |
| KEGG_CALCIUM_SIGNALING_PATHWAY | 1 | 0 | 143 | 1 |
| GO_NEGATIVE_REGULATION_OF_MOLECULAR_FUNCTION | 1 | 0 | 843 | 1 |
| GO_REGULATION_OF_PROTEIN_MODIFICATION_PROCESS | 1 | 0 | 1458 | 1 |
| GO_RESPONSE_TO_LIGHT_STIMULUS | 1 | 0 | 282 | 1 |
I next will repeat fry to look within the entire list of genes detectable in the experiemnt.
fryRes_RUVk1 <- design %>% colnames() %>% .[2:3] %>%
sapply(function(x) {
cpm(dge, log = TRUE) %>%
fry(
index = c(KEGG, ireGenes),
design = design_RUV,
contrast = x,
sort = "mixed"
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = x,
sig = FDR.Mixed < 0.05)
}, simplify = FALSE)
fryRes_RUVk1 %>%
bind_rows() %>%
dplyr::filter(sig == TRUE) %>%
dplyr::select(pathway, NGenes, Direction, PValue.Mixed, FDR.Mixed, coef) %>%
kable(caption = "Gene sets showing significant changes to expression as a group using fry, at an FDR of 5%") %>%
kable_styling()
| pathway | NGenes | Direction | PValue.Mixed | FDR.Mixed | coef |
|---|---|---|---|---|---|
| KEGG_NOTCH_SIGNALING_PATHWAY | 43 | Down | 2.60e-06 | 0.0005004 | FS |
| KEGG_OOCYTE_MEIOSIS | 101 | Up | 1.09e-05 | 0.0018510 | FAD |
| KEGG_CELL_CYCLE | 116 | Down | 1.95e-05 | 0.0018510 | FAD |
Since only few gene sets are observed to be altered by the FAD-like mutation, I will perform a gene set testing method inspired by the EGSEA method, which performs multiple gene set testing algorithms (here, I will use fry, camera and fgsea), then combine the resulting p-values by calculating the harmonic mean p-value. The HMP does push the average towards smaller p-values. However, it does not produce any smaller p-values that already exist from each of the individual test. It is robust to dependent tests (which in this case, it is), and is thought to be less restrictive than other methods.
cameraRes_RUVk1 <- design %>% colnames() %>% .[2:3] %>%
sapply(function(x) {
cpm(dge, log = TRUE) %>%
camera(
index = c(KEGG, ireGenes),
design = design_RUV,
contrast = x,
inter.gene.cor = NA
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = x)
}, simplify = FALSE)
ranks <- topTables_glm_RUV %>%
lapply(function(x) { x %>%
mutate(rankstat = sign(logFC)*-log10(PValue)) %>%
arrange(desc(rankstat)) %>%
dplyr::select(c("symbol", "rankstat")) %>%
with(structure(rankstat, names = symbol))
})
# set a seed for a reproducible result.
set.seed(1)
fgsea_RUVk1 <- ranks %>%
sapply(function(x) {
fgseaMultilevel(stats = x, pathways = c(KEGG, ireGenes))},
simplify = FALSE)
fgsea_RUVk1$FS %<>%
mutate(coef = "FS")
fgsea_RUVk1$FAD %<>%
mutate(coef = "FAD")
HMP <- fryRes_RUVk1 %>%
bind_rows() %>%
dplyr::select(pathway, PValue.Mixed, coef) %>%
dplyr::rename(fry_p = PValue.Mixed) %>%
left_join(cameraRes_RUVk1 %>%
bind_rows() %>%
dplyr::select(pathway, PValue, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(camera_p = PValue) %>%
left_join(fgsea_RUVk1 %>%
bind_rows() %>%
dplyr::select(pathway, pval, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(fgsea_p = pval) %>%
bind_rows() %>%
nest(p = one_of(c("fry_p", "camera_p", "fgsea_p"))) %>%
mutate(
harmonic_p = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
p.hmp(x, L = 3)
}, numeric(1))
) %>%
unnest() %>%
mutate(harmonic_p_FDR = p.adjust(harmonic_p, "fdr"),
sig = harmonic_p_FDR < 0.05) %>%
arrange(harmonic_p_FDR)
sigpaths <- HMP %>%
dplyr::filter(sig == TRUE) %>%
.$pathway
HMP %>%
dplyr::filter(pathway %in% sigpaths) %>%
ggplot(aes(coef, pathway, size =-log10(harmonic_p))) +
geom_point(aes(colour = sig)) +
geom_text(
aes(label = harmonic_p_FDR %>% signif(1)),
size = 2.5,
vjust = 2.5
) +
labs(colour = "harmonic mean p < 0.05 \nafter FDR adjustment")+
ggpubr::theme_pubclean()+
theme(legend.position = "right")+
scale_color_manual(values = c("grey30", "turquoise3"))
nums <- HMP %>%
dplyr::select(pathway, coef, harmonic_p_FDR) %>%
dplyr::filter(pathway %in% sigpaths) %>%
mutate(num = case_when(
harmonic_p_FDR < 0.05 ~ paste(harmonic_p_FDR %>% signif(2)),
TRUE ~ "ns"
)) %>%
dplyr::select(pathway, coef, num) %>%
spread(key = "coef", value = "num") %>%
column_to_rownames("pathway") %>%
t()
HMP %>%
dplyr::select(pathway, coef, harmonic_p_FDR) %>%
dplyr::filter(pathway %in% sigpaths) %>%
mutate(harmonic_p_FDR = -log10(harmonic_p_FDR)) %>%
spread(key = "coef", value = "harmonic_p_FDR") %>%
column_to_rownames("pathway") %>%
t() %>%
pheatmap(color = viridis_pal(option = "magma")(100),
treeheight_row = 0, treeheight_col = 0,
cellwidth = 30, clustering_method = "median",
cellheight = 30,
display_numbers = nums,
#angle_col = 45,
number_color = "white")
Using the HMP method of combining p-values for GSEA, 6 gene sets significantly altered by the FAD mutation, and 6 gene sets significantly altered by the FS mutation. The ribosome gene set is altered by both mutations. However, it is much more significant in the FAD. The AD gene set is only significant in the FAD.
No ire gene sets were found to be significantly altered. However, out of all the ire gene sets, the ire3_all was the most significantly altered in the FAD.
HMP %>%
dplyr::filter(pathway %>% grepl(pathway, pattern = "ire")) %>%
dplyr::select(-c(fry_p, fgsea_p, camera_p)) %>%
kable(caption = "harmonic mean p-value of IRE gene sets") %>%
kable_styling()
| pathway | coef | harmonic_p | harmonic_p_FDR | sig |
|---|---|---|---|---|
| ire3_all | FAD | 0.0378574 | 0.3785742 | FALSE |
| ire3_all | FS | 0.1993138 | 0.7012893 | FALSE |
| ire3_hq | FS | 0.5549262 | 0.7988750 | FALSE |
| ire5_all | FS | 0.5187127 | 0.7988750 | FALSE |
| ire5_hq | FS | 0.6835777 | 0.7988750 | FALSE |
| ire3_hq | FAD | 0.3569070 | 0.7988750 | FALSE |
| ire5_all | FAD | 0.6142390 | 0.7988750 | FALSE |
| ire5_hq | FAD | 0.7467348 | 0.7988750 | FALSE |
The plots below show that the signals for the sig gene sets are mostly independent of one another. (As shown by the leading edge genes, which can be thought of as the core genes which drive the enrichment of the gene set. )
fgsea_RUVk1$FS %>%
dplyr::filter(pathway %in% c(HMP %>%
dplyr::filter(coef == "FS") %>%
dplyr::filter(sig == TRUE) %>% .$pathway)) %>%
dplyr::select(pathway, leadingEdge) %>%
mutate(pathway = str_remove(pathway, "KEGG_"),
pathway = str_replace(pathway,
pattern = "TRANSENDOTHELIAL_MIGRATION",
replacement = "TRANSENDO_MIGR...")) %>%
unnest %>%
split(f = .$pathway) %>%
lapply(magrittr::extract2,"leadingEdge") %>%
fromList() %>%
upset(order.by = "freq",
nsets = length(.),
mb.ratio = c(0.5, 0.5)
)
fgsea_RUVk1$FAD %>%
dplyr::filter(pathway %in% c(HMP %>%
dplyr::filter(coef == "FAD") %>%
dplyr::filter(sig == TRUE) %>% .$pathway)) %>%
dplyr::select(pathway, leadingEdge) %>%
mutate(pathway = str_remove(pathway, "KEGG_")) %>%
unnest %>%
split(f = .$pathway) %>%
lapply(magrittr::extract2,"leadingEdge") %>%
fromList() %>%
upset(order.by = "freq", nsets = length(.),
mb.ratio = c(0.5, 0.5))
topTables_glm_RUV %>%
bind_rows() %>%
dplyr::filter(symbol %in% KEGG$KEGG_RIBOSOME) %>%
dplyr::select(symbol, logFC, coef) %>%
spread(key = "coef", value = "logFC") %>%
column_to_rownames("symbol") %>%
pheatmap(
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
breaks = c(
seq(min(.), 0, length.out=ceiling(100/2) + 1),
seq(max(.)/100, max(.), length.out=floor(100/2))
),
treeheight_row = 0, treeheight_col = 0,
main = "KEGG_RIBOSOME"
)
topTables_glm_RUV %>%
bind_rows() %>%
dplyr::filter(symbol %in% KEGG$KEGG_OXIDATIVE_PHOSPHORYLATION) %>%
dplyr::select(symbol, logFC, coef) %>%
spread(key = "coef", value = "logFC") %>%
column_to_rownames("symbol") %>%
pheatmap(color = colorRampPalette(rev(brewer.pal(n = 7, name =
"RdBu")))(100),
treeheight_row = 0,
main = "KEGG_OXIDATIVE_PHOSPHORYLATION",
treeheight_col = 0,
breaks = c(
seq(min(.), 0, length.out=ceiling(100/2) + 1),
seq(max(.)/100, max(.), length.out=floor(100/2))
)
)
sessionInfo() %>%
pander()
R version 4.0.2 (2020-06-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
locale: en_AU.UTF-8||en_AU.UTF-8||en_AU.UTF-8||C||en_AU.UTF-8||en_AU.UTF-8
attached base packages: stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: ensembldb(v.2.12.1), AnnotationFilter(v.1.12.0), GenomicFeatures(v.1.40.1), AnnotationDbi(v.1.50.3), gridExtra(v.2.3), UpSetR(v.1.4.0), ngsReports(v.1.4.2), harmonicmeanp(v.3.0), FMStable(v.0.1-2), RUVSeq(v.1.22.0), EDASeq(v.2.22.0), ShortRead(v.1.46.0), GenomicAlignments(v.1.24.0), SummarizedExperiment(v.1.18.2), DelayedArray(v.0.14.1), matrixStats(v.0.56.0), Rsamtools(v.2.4.0), GenomicRanges(v.1.40.0), GenomeInfoDb(v.1.24.2), Biostrings(v.2.56.0), XVector(v.0.28.0), IRanges(v.2.22.2), S4Vectors(v.0.26.1), BiocParallel(v.1.22.0), Biobase(v.2.48.0), goseq(v.1.40.0), geneLenDataBase(v.1.24.0), BiasedUrn(v.1.07), here(v.0.1), pheatmap(v.1.0.12), ggpubr(v.0.4.0), ggfortify(v.0.4.10), kableExtra(v.1.2.1), pander(v.0.6.3), msigdbr(v.7.1.1), fgsea(v.1.14.0), RColorBrewer(v.1.1-2), ggrepel(v.0.8.2), forcats(v.0.5.0), stringr(v.1.4.0), dplyr(v.1.0.2), purrr(v.0.3.4), readr(v.1.3.1), tidyr(v.1.1.2), tibble(v.3.0.3), ggplot2(v.3.3.2), tidyverse(v.1.3.0), scales(v.1.1.1), magrittr(v.1.5), AnnotationHub(v.2.20.2), BiocFileCache(v.1.12.1), dbplyr(v.1.4.4), BiocGenerics(v.0.34.0), tximport(v.1.16.1), edgeR(v.3.30.3) and limma(v.3.44.3)
loaded via a namespace (and not attached): R.utils(v.2.10.1), tidyselect(v.1.1.0), RSQLite(v.2.2.0), htmlwidgets(v.1.5.1), FactoMineR(v.2.3), grid(v.4.0.2), DESeq(v.1.39.0), munsell(v.0.5.0), statmod(v.1.4.34), DT(v.0.15), withr(v.2.2.0), colorspace(v.1.4-1), highr(v.0.8), knitr(v.1.29), rstudioapi(v.0.11), leaps(v.3.1), ggsignif(v.0.6.0), labeling(v.0.3), GenomeInfoDbData(v.1.2.3), hwriter(v.1.3.2), farver(v.2.0.3), bit64(v.4.0.5), rhdf5(v.2.32.2), rprojroot(v.1.3-2), vctrs(v.0.3.4), generics(v.0.0.2), xfun(v.0.16), R6(v.2.4.1), locfit(v.1.5-9.4), bitops(v.1.0-6), assertthat(v.0.2.1), promises(v.1.1.1), gtable(v.0.3.0), rlang(v.0.4.7), genefilter(v.1.70.0), scatterplot3d(v.0.3-41), splines(v.4.0.2), lazyeval(v.0.2.2), rtracklayer(v.1.48.0), rstatix(v.0.6.0), broom(v.0.7.0), reshape2(v.1.4.4), BiocManager(v.1.30.10), yaml(v.2.2.1), abind(v.1.4-5), modelr(v.0.1.8), backports(v.1.1.9), httpuv(v.1.5.4), tools(v.4.0.2), ellipsis(v.0.3.1), ggdendro(v.0.1.21), plyr(v.1.8.6), Rcpp(v.1.0.5), progress(v.1.2.2), zlibbioc(v.1.34.0), RCurl(v.1.98-1.2), prettyunits(v.1.1.1), openssl(v.1.4.2), viridis(v.0.5.1), zoo(v.1.8-8), cluster(v.2.1.0), haven(v.2.3.1), fs(v.1.5.0), data.table(v.1.13.0), openxlsx(v.4.1.5), reprex(v.0.3.0), truncnorm(v.1.0-8), ProtGenerics(v.1.20.0), aroma.light(v.3.18.0), hms(v.0.5.3), mime(v.0.9), evaluate(v.0.14), xtable(v.1.8-4), XML(v.3.99-0.5), rio(v.0.5.16), jpeg(v.0.1-8.1), readxl(v.1.3.1), compiler(v.4.0.2), biomaRt(v.2.44.1), crayon(v.1.3.4), R.oo(v.1.24.0), htmltools(v.0.5.0), mgcv(v.1.8-33), later(v.1.1.0.1), geneplotter(v.1.66.0), lubridate(v.1.7.9), DBI(v.1.1.0), MASS(v.7.3-52), rappdirs(v.0.3.1), Matrix(v.1.2-18), car(v.3.0-9), cli(v.2.0.2), R.methodsS3(v.1.8.1), pkgconfig(v.2.0.3), flashClust(v.1.01-2), foreign(v.0.8-80), plotly(v.4.9.2.1), xml2(v.1.3.2), annotate(v.1.66.0), webshot(v.0.5.2), rvest(v.0.3.6), digest(v.0.6.25), rmarkdown(v.2.3), cellranger(v.1.1.0), fastmatch(v.1.1-0), curl(v.4.3), shiny(v.1.5.0), lifecycle(v.0.2.0), nlme(v.3.1-149), jsonlite(v.1.7.0), Rhdf5lib(v.1.10.1), carData(v.3.0-4), viridisLite(v.0.3.0), askpass(v.1.1), fansi(v.0.4.1), pillar(v.1.4.6), lattice(v.0.20-41), fastmap(v.1.0.1), httr(v.1.4.2), survival(v.3.2-3), GO.db(v.3.11.4), interactiveDisplayBase(v.1.26.3), glue(v.1.4.2), zip(v.2.1.1), png(v.0.1-7), BiocVersion(v.3.11.1), bit(v.4.0.4), stringi(v.1.4.6), blob(v.1.2.1), latticeExtra(v.0.6-29) and memoise(v.1.1.0)